home *** CD-ROM | disk | FTP | other *** search
Text File | 1994-05-14 | 2.3 KB | 72 lines | [TEXT/KEEN] |
- #$WordFrequency: print a sorted list of words, together with the number
- #of times each word occurs. Use with any input option, and
- #select "Show stdout", since that's where the output goes.
-
- #The file "common words" in the "hAWK programs" folder contains
- #a list of words to skip. To do a better job, you can create a
- #custom list - for example, the word "while" can be skipped in
- #ordinary text, but should be included if the text deals with
- #C or hAWK programming. If this file is missing, the program
- #will still run, but common words will not be skipped (this
- #uses a lot more memory).
- #
- #This isn't perfect, but is very useful as-is. It's a simple
- #program, one you can tinker with easily - try it out on
- #some small files, and refinements will
- #suggest themselves.
- #
- # User’s Manual references:
- # «hAWK User’s Manual» «F Running hAWK programs»
- # «hAWK User’s Manual» «L 5 Regular expressions»
- # «hAWK User’s Manual» «M 5 Built-in string and file functions»
- # «hAWK User’s Manual» «K 4 Built-in variables»
- # «hAWK User’s Manual» «K 8 Arrays»
- # «hAWK User’s Manual» «N User-defined functions»
- # «hAWK User’s Manual» «P 3 The getline function»
- # «hAWK User’s Manual» «O 3 Output into files»
- # «hAWK User’s Manual» «Q The hAWK function»
-
# Get list of common words to skip.
# Every whitespace-separated field of the "common words" file becomes a key
# of common[]. If the file cannot be read, getline never returns a positive
# value, the loop body is skipped, and common[] stays empty (so no words are
# skipped later on).
BEGIN {
    # NOTE(review): STDPATH is not assigned anywhere in this file; presumably
    # a path prefix supplied by hAWK - confirm against the hAWK manual.
    commonfile = STDPATH "Drag_on Modules:hAWK programs:" "common words"

    # The redirected getline is parenthesized on purpose: the bare form
    # "getline < commonfile > 0" is ambiguous and may be parsed as
    # "getline < (commonfile > 0)", i.e. a read from a file named "0" or "1".
    while ((getline < commonfile) > 0) {
        for (k = 1; k <= NF; k++)
            common[$k] = 1      # Forces common[$k] to "exist".
    }
    close(commonfile)

    # Plain getline overwrites $0, so drop the last word-list line here.
    $0 = ""

    ## time_it = 1
    if (time_it == 1)
        print "Starting time", time()
}
-
- #If you're debugging a hAWK program, you might want to
- #simplify output by uncommenting the pattern-action below
- #and saving this as "$hAWKWordFrequency" or somesuch.
-
- ## /^#/ {next} #skip lines that begin with a hAWK comment
-
# Remove non-word characters, count words.
# Runs for every input record: each run of characters outside
# [A-Za-z_0-9$'-] is replaced by one blank, then every field that is longer
# than one character and is not a key of common[] is tallied in count[].
{
    gsub(/[^A-Za-z_0-9$'-]+/, " ")
    #or try gsub(/\W+/, " ")   #W == [^A-Z_a-z0-9]

    # The total includes every field of the cleaned record, even the ones
    # that are skipped below.
    total_words += NF

    for (i = 1; i <= NF; i++) {
        word = $i
        if (length(word) <= 1)
            continue            # single characters are never counted
        if (word in common)
            continue            # listed as a common word: skip it
        count[word]++
    }
}
# Sort associative array, and print words with count.
# Output is one line per counted word (word, two tabs, count), printed in
# the order given by ind[], followed by the total number of fields seen.
END {
    # Flatten count[] into linear[1..m], one printable line per word.
    for (w in count)
        linear[++m] = w "\t\t" count[w]

    # sort() is not defined in this file. NOTE(review): presumably a hAWK
    # built-in that fills ind[] with the indices of linear[] in sorted order,
    # with "d" selecting the sort mode - confirm against the hAWK manual.
    sort(linear, ind, "d")

    j = 1
    while (j <= m) {
        print linear[ind[j]]
        ++j
    }

    print "Total words:", total_words
    if (time_it == 1)
        print "Finishing time", time()
}
-
-